#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# QQ: 34538980@qq.com
# Jekkay Hu, 2013.5.5
# ///////////////////////////////////////////////////////////////////
#                            _ooOoo_                               //
#                           o8888888o                              //
#                           88" . "88                              //
#                           (| ^_^ |)                              //
#                           O\  =  /O                              //
#                        ____/`---'\____                           //
#                      .'  \\|     |//  `.                         //
#                     /  \\|||  :  |||//  \                        //
#                    /  _||||| -:- |||||-  \                       //
#                    |   | \\\  -  /// |   |                       //
#                    | \_|  ''\---/''  |   |                       //
#                    \  .-\__  `-`  ___/-. /                       //
#                  ___`. .'  /--.--\  `. . ___                     //
#                ."" '<  `.___\_<|>_/___.'  >'"".                  //
#              | | :  `- \`.;`\ _ /`;.`/ - ` : | |                 //
#              \  \ `-.   \_ __\ /__ _/   .-` /  /                 //
#        ========`-.____`-.___\_____/___.-`____.-'========         //
#                             `=---='                              //
#        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^        //
#             佛祖保佑       永无BUG        运行正常                   //
#     -------------------------------------------------------      //
#               QQ: 34538980@qq.com                                //
#               博客: http://www.easysb.cn                          //
#               Jekkay Hu, 2013.5.5                                //
# ///////////////////////////////////////////////////////////////////
#
# 工作扫描的主线程，从队列中获取扫描目标
# 1. 获取主页内容，提取新的域名
# 2. 指纹识别
#
import threading
import logging
from conf.context import context
from utility.function import extract_url_suffix, get_url, random_md5, get_file, calculate_md5, calculate_file_md5, \
    make_parent_folder, extract_title
from conf.const import SCAN_STATE_FINISH
from utility.target import Target
from scan.links_processor import LinksProcessor
import time
import re
from os import remove


class ScanWorkThread(threading.Thread):
    """Worker thread for the scan queue.

    Pulls one domain at a time from *domain_queue*, then:
      1. fetches the home page and extracts new domains from its links,
      2. runs the configured fingerprint rules (regex / md5) against it.

    Matched targets are pushed to *result_target_queue*; newly discovered
    domains (and the scanned domain itself) go to *result_domain_queue*.
    """

    def __init__(self, domain_queue, result_domain_queue, result_target_queue):
        super(ScanWorkThread, self).__init__()
        self.domain_queue = domain_queue
        self.result_domain_queue = result_domain_queue
        self.result_target_queue = result_target_queue
        self.is_working = False
        # Non-daemon so in-flight scans can finish before interpreter exit.
        # Attribute assignment replaces the deprecated setDaemon() call.
        self.daemon = False
        # Set to True by the owner to make run() exit its loop.
        self.exit_flag = False
        # Pause between two scans, in seconds (configured in milliseconds).
        self.scan_interval = 0
        if context.setting['scan'] and context.setting['scan']['scan_interval'] \
                and context.setting['scan']['scan_interval'] > 0:
            logging.info('scan interval is [ %s ] ms' % context.setting['scan']['scan_interval'])
            self.scan_interval = context.setting['scan']['scan_interval'] / 1000.0

    def run(self):
        """Main loop: poll the domain queue until exit_flag is set."""
        logging.info('start scan thread...')
        while not self.exit_flag:
            try:
                self.is_working = False
                if self.domain_queue.qsize() <= 0:
                    time.sleep(0.1)
                    continue
                domain = self.domain_queue.get()
                self.is_working = True
                if domain:
                    self.scan(domain)
                    logging.info('scan %s finish!' % domain['domain'])
                if self.scan_interval:
                    time.sleep(self.scan_interval)
            except Exception as e:
                # Keep the worker alive: one bad domain must not kill the thread.
                logging.error('ScanWorkThread %s' % str(e))

    def scan(self, domain):
        """Scan one domain dict: fetch '/', extract links, apply all rules."""
        # Per-domain response cache / new-link collector.
        response_context = DomainResponseContext(domain)
        try:
            # Fetch the home page first to seed title/link extraction.
            response = response_context.get("/")
            if response:
                self._extract_url_list(response.html, response_context)
            if not context.setting['rules']:
                return
            # First matching rule wins.
            for rule in context.setting['rules']:
                res = self._check_rule(response_context, rule)
                if res:
                    self._process_result(res)
                    break
        finally:
            # Always publish discovered domains and drop cached files,
            # even when a fetch or a rule check raised.
            self._push_new_links(response_context)
            response_context.clear()

    def _check_rule(self, response_context, rule):
        """Apply one fingerprint rule; return a Target on match, else None."""
        if not rule['url']:
            return
        response = response_context.get(rule['url'])
        if not response or response.is_empty():
            return
        # Any html we fetched is also a source of new links.
        if response.html:
            self._extract_url_list(response.html, response_context)
        # Regex check first, md5 digest as the fallback.
        match = self._check_rule_reg(response, rule['re']) if rule['re'] else None
        if not match and rule['md5']:
            match = self._check_rule_md5(response, rule['md5'])
        if not match:
            return
        target = Target()
        target.id = random_md5()
        target.domain_id = response_context.domain['id']
        target.domain = response_context.domain['domain']
        target.target = rule['name']
        # '/' is cached by the context, so this does not re-fetch.
        target.title = extract_title(response_context.get("/").html)
        target.match = '%s | %s' % (match, rule['url'])
        return target

    def _check_rule_reg(self, response, rule_reg):
        """Search the response body for *rule_reg* (case-insensitive).

        Returns the first hit with up to 40 non-space characters of context
        on each side, or None when there is no body or no match.
        """
        if not response.html:
            self._read_cache_file(response)
        if response.html:
            # Raw string: '\S' is an invalid escape in a plain literal.
            matches = re.findall(r'(\S{0,40}%s\S{0,40})' % rule_reg, str(response.html), re.I)
            return str(matches[0]) if matches else None

    def _read_cache_file(self, response):
        """Load the on-disk cached body into response.html (best effort)."""
        if not response.file_path:
            return
        try:
            # 'with' closes the handle deterministically instead of leaking
            # it until garbage collection.
            with open(response.file_path, mode='r', encoding='utf-8') as f:
                response.html = f.read()
        except Exception as e:
            logging.error("fail to read cache file %s, %s" % (response.file_path, str(e)))

    # md5 matching prefers the cached file's digest over the html body's.
    def _check_rule_md5(self, response, rule_md5):
        """Match by md5 digest; return the digest on match, else None."""
        # Initialize explicitly: the original raised UnboundLocalError when
        # response.file_path was empty.
        cur_md5 = None
        if response.file_path:
            cur_md5 = calculate_file_md5(response.file_path)
        if not cur_md5 and response.html:
            cur_md5 = calculate_md5(response.html)
        return cur_md5 if cur_md5 == rule_md5 else None

    def _extract_url_list(self, html, response_context):
        """Collect distinct, path-stripped links from *html* into the context."""
        if not self.result_domain_queue:
            return
        links = LinksProcessor.extract_distinct_links(html)
        if links:
            links = set(map(LinksProcessor.strip_link_path, links))
            response_context.add_new_link(*links)

    def _push_new_links(self, response_context):
        """Publish the scanned domain (with its title) and any new domains."""
        if not self.result_domain_queue:
            return
        # Refresh the scanned domain's title from the (cached) home page.
        title = extract_title(response_context.get("/").html)
        if not title:
            title = ''
        d = LinksProcessor.wrap_domain(response_context.domain, title)
        response_context.domain['title'] = title
        response_context.domain['scan'] = SCAN_STATE_FINISH
        logging.info('> process [ %s] , [ %s ]', d.domain, title)
        self.result_domain_queue.put(d)

        if response_context.new_links_list:
            for link in response_context.new_links_list:
                d = LinksProcessor.convert_link_to_domain(link)
                if d:
                    logging.info('find new domain [ %s ]' % d.domain)
                    self.result_domain_queue.put(d)

    def _process_result(self, target):
        """Forward a matched Target to the result queue."""
        logging.info('[success] find target, [ %s ], [ %s ], %s', target.domain, target.target, target.match)
        if self.result_target_queue:
            self.result_target_queue.put(target)


class DomainResponseContext(object):
    """Per-domain request helper.

    Caches one Response per request path (so repeated rule URLs hit the
    network only once) and collects off-site links discovered during the
    scan. clear() must be called when the scan ends to delete cache files.
    """

    def __init__(self, domain):
        # Domain record dict; get/_create_url read 'domain' and 'is_http'.
        self.domain = domain
        # request_url -> Response cache.
        self.response_cache = {}
        # Off-site links discovered while scanning this domain.
        self.new_links_list = []

    def get(self, request_url):
        """Return the (cached) Response for *request_url* on this domain."""
        if not request_url.startswith('/'):
            request_url = '/' + request_url
        if request_url in self.response_cache:
            return self.response_cache[request_url]
        url = self._create_url(request_url)
        response = self._get_response(url)
        # Cache even empty responses so failures are not retried per rule.
        self.response_cache[request_url] = response
        return response

    # Build the full request URL.
    def _create_url(self, request_url):
        # Scheme follows the domain's is_http flag ('http' preferred).
        return '%s://www.%s%s' % (('http' if self.domain['is_http'] else 'https'), self.domain['domain'], request_url)

    def _get_response(self, url):
        """Fetch *url*; cacheable suffixes are downloaded to a cache file."""
        suffix = extract_url_suffix(url)
        if not suffix or suffix in context.setting['no_cache_file_suffix']:
            return Response(html=get_url(url))
        # suffix is guaranteed truthy past the early return above.
        suffix = '.' + suffix
        cache_file_path = '%s/%s/%s%s' % (context.setting['cache_folder'], time.strftime("%Y/%m/%d", time.localtime()),
                                          random_md5(), suffix)
        if get_file(url, cache_file_path):
            return Response(file_path=cache_file_path)
        return Response()

    def add_new_link(self, *links):
        """Record links pointing off this domain, de-duplicated."""
        for link in links:
            link = LinksProcessor.strip_link_path(link)
            # 'not in' is the O(n) membership test; list.count was O(n) too
            # but scans the whole list even after a hit.
            if link and link.find(self.domain['domain']) < 0 and link not in self.new_links_list:
                self.new_links_list.append(link)

    def clear(self):
        """Delete cached files and release the caches (best effort)."""
        if self.response_cache:
            for v in self.response_cache.values():
                if v.file_path:
                    try:
                        remove(v.file_path)
                    except OSError as e:
                        # One missing/locked file must not abort cleanup of
                        # the remaining cache files.
                        logging.error('fail to remove cache file %s, %s' % (v.file_path, str(e)))
            self.response_cache.clear()
            self.response_cache = None
        if self.new_links_list:
            self.new_links_list.clear()
            self.new_links_list = None


class Response(object):
    """Result of fetching a URL: an in-memory body, a cache-file path, or neither."""

    def __init__(self, html=None, file_path=None):
        # Body text when the response was kept in memory.
        self.html = html
        # Path of the on-disk cache file for downloaded content.
        self.file_path = file_path

    def is_empty(self):
        """Return True when neither a body nor a cached file is available."""
        return not (self.html or self.file_path)
